#loading packages and dataset
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.5 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## Warning: package 'readr' was built under R version 4.0.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(usmap)
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
covid<-read.csv("data/Provisional_COVID-19_Deaths_by_Sex_and_Age.csv")
#combined New York City with New York as they were originally separated
covid$State[covid$State=="New York City"]<-'New York'
#refactored age ranges into (teenager, adult, senior, child, infant, young adult)
covid$age<-factor(covid$Age.Group, levels=c("infant", "child", "young adult", "adult", "senior", "overall"))
covid$age[covid$Age.Group=="65-74 years"|covid$Age.Group=="75-84 years"|covid$Age.Group=="85 years and over"]<-"senior"
covid$age[covid$Age.Group=="55-64 years"|covid$Age.Group=="30-39 years"|covid$Age.Group=="35-44 years"|covid$Age.Group=="45-54 years"|covid$Age.Group=="40-49 years"|covid$Age.Group=="50-64 years"]<-"adult"
covid$age[covid$Age.Group=="Under 1 year"]<-"infant"
covid$age[covid$Age.Group=="1-4 years"|covid$Age.Group=="0-17 years"|covid$Age.Group=="5-14 years"]<-"child"
covid$age[covid$Age.Group=="18-29 years"|covid$Age.Group=="15-24 years"|covid$Age.Group=="25-34 years"]<-"young adult"
covid$age[covid$Age.Group=="All Ages"]<-"overall"
#created another new dataframe "covid_age" to use for following chart
covid_age<-covid%>%
filter(State=="United States", Sex=="Male"| Sex=="Female", age!="overall")%>%
group_by(age, Sex)%>%
summarize(total_covid_deaths=sum(COVID.19.Deaths, na.rm=T))
#created a bargraph to show the deaths due to covid within each age group & gender
agevsdeath<-ggplot(covid_age, aes(fill=Sex, y=total_covid_deaths, x=age)) +
geom_bar(position=position_dodge(), stat="identity")+
labs(x="Age Group", y="Total Covid Deaths", title="Total Covid Deaths by Age")
In every age group there are more males than females that die to covid, even in infants and children which suggests a genetic component to the differences in covid mortality.
#creating a interactive plot from the ggplot above (covid deaths vs age & gender)
ggplotly(agevsdeath)
#created a new dataframe to base following interactive map on
geo_covid<-covid%>%
filter(Age.Group=="All Ages", Sex=="Male"|Sex=="Female", State!="United States", Group!="By Total")%>%
group_by(State,Sex)%>%
summarize(total_covid_death=sum(COVID.19.Deaths, na.rm=T))%>%
mutate(code=state.abb[match(State, state.name)])%>%
pivot_wider(names_from = "Sex", values_from="total_covid_death")%>%
mutate(percent_male=round(100*Male/(Female+Male)))%>%
mutate(percent_female=round(100*Female/(Female+Male)))%>%
mutate(total_death=Female+Male)
The trend that appears is that males are more disportionately affected in the “west” states than the east
#created an interactive map showing number of covid deaths per state throughout the years
plot_geo(data=geo_covid, locationmode='USA-states')%>%
add_trace(locations=~code,
z=~percent_male,
text=~paste("State:", code, "<br>",
"Female Deaths:", Female, "<br>",
"Male Deaths:", Male, "<br>",
"Percent Male Deaths(%):", percent_male, "<br>",
"Percent Female Deaths(%):", percent_female, "<br>",
"Total Covid Deaths:", total_death))%>%
colorbar(title="Number of Covid Deaths")%>%
layout(title="2020-2022 US Covid Deaths by State", geo=list(scope='usa'))
Not a single state has more female deaths than male deaths
#top 10 states with greatest percent male death
geo_covid%>%
arrange(desc(percent_male))
## # A tibble: 52 × 7
## # Groups: State [52]
## State code Female Male percent_male percent_female total_death
## <chr> <chr> <int> <int> <dbl> <dbl> <int>
## 1 Nevada NV 9480 14888 61 39 24368
## 2 Utah UT 4580 7037 61 39 11617
## 3 Hawaii HI 1417 2168 60 40 3585
## 4 Alaska AK 1134 1637 59 41 2771
## 5 Arizona AZ 24831 35431 59 41 60262
## 6 California CA 90060 126321 58 42 216381
## 7 Idaho ID 4621 6445 58 42 11066
## 8 Texas TX 88134 119445 58 42 207579
## 9 Colorado CO 13220 17304 57 43 30524
## 10 Montana MT 3375 4472 57 43 7847
## # … with 42 more rows
#top 10 states with lowest percent male death
geo_covid%>%
arrange(percent_male)
## # A tibble: 52 × 7
## # Groups: State [52]
## State code Female Male percent_male percent_female total_death
## <chr> <chr> <int> <int> <dbl> <dbl> <int>
## 1 Rhode Island RI 3973 4051 50 50 8024
## 2 Connecticut CT 12224 12703 51 49 24927
## 3 Massachusetts MA 20546 21613 51 49 42159
## 4 Delaware DE 3214 3547 52 48 6761
## 5 Kentucky KY 19331 20854 52 48 40185
## 6 Maine ME 3011 3273 52 48 6284
## 7 Mississippi MS 14304 15301 52 48 29605
## 8 Pennsylvania PA 50288 55089 52 48 105377
## 9 Arkansas AR 11692 13414 53 47 25106
## 10 Indiana IN 25121 28290 53 47 53411
## # … with 42 more rows
#taking 2 states with the greatest percentage of male deaths, 2 with the lowest, and 2 in the middle and placing them into a new dataset
state_age<-covid%>%
filter(State=="Nevada"|State=="Utah"|State=="Connecticut"|State=="Massachusetts"|State=="South Carolina"|State=="Tennessee", Sex=="Male"| Sex=="Female", age!="overall")%>%
group_by(age, Sex, State)%>%
summarize(total_covid_deaths=sum(COVID.19.Deaths, na.rm=T))
It appears that in the states where males have a significantly greater covid mortality percentage, senior males are what accounts for the difference.
#creating mulitple bargraphs to see which age group is most effected in the states with a higher percentage of male covid deaths
plotly_age_state<-ggplot(state_age, aes(fill=Sex, y=total_covid_deaths, x=age)) +
geom_bar(position=position_dodge(), stat="identity")+
facet_wrap(~State)+
labs(x="Age Group", y="Total Covid Deaths", title="Total Covid Deaths by Age")
ggplotly(plotly_age_state)
Regardless of the season, there are more males that die due to covid than females
#subsetting the months into seasons
covid$seasons<-factor(covid$Month, levels = c("summer", "fall", "spring", "winter"))
covid$seasons[covid$Month<=2|covid$Month==12]<-"winter"
covid$seasons[covid$Month>=6 & covid$Month<=8]<-"summer"
covid$seasons[covid$Month>=9 & covid$Month<=11]<-"fall"
covid$seasons[covid$Month>=3 & covid$Month<=5]<-"spring"
#creating a double bar chart to compare male and female covid deaths in different seasons
covid%>%
filter(Group=="By Month", Sex!="All Sexes")%>%
ggplot(aes(x=seasons, y=COVID.19.Deaths, fill=Sex))+
geom_col(position=position_dodge(), stat="identity")+
labs(title="Number of All Time Covid Deaths by Season",x="Seasons", y="Number of Covid Deaths")
#creating a double bar chart to compare male and female covid deaths throughout the years
covid%>%
filter(Group=="By Year", Sex!="All Sexes")%>%
ggplot(aes(x=Year, y=COVID.19.Deaths, fill=Sex))+
geom_col(position=position_dodge(), stat="identity")+
labs(title="Number of All Time Covid Deaths by Year",x="year", y="Number of Covid Deaths")